cd "\\won.valuta.nhh.no\home\s14958\Documents\JFQA index paper\v3"
clear
set more 1
cap log close

log using ./logs/0_sample_data_prep.txt, replace


use "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", clear
*keep only relevant variables to identidy newly established firms
keep orgnr aar stiftaar mors* selskf
rename stiftaar founded
**keep only founded between 2004 and 2018
drop if founded<2004 | founded>2018
**check if there is a parent company
replace mors_eandel=0 if mors_eandel==.
*first check on the average ownership of the parent company over the firms
bys orgnr: egen test1=mean(mors_eandel)
sort orgnr aar
bys orgnr: gen first=1 if _n==1
*second check what is the parent's ownership in the first year
gen test2=(mors_eandel>=90 & first==1)
keep if first==1
*358,782 firms established in 2003-2018
*keep only if founded as C-Corp
keep if selskf=="AS"
*322,113 firms
*exclude financial and real estate firms (identified and checked manually)
merge 1:1 orgnr using ./0_financial_and_real_estate_firms.dta
keep if _merge==1
drop _merge
**additional manual check to exclude financial firms
merge 1:1 orgnr using ./helpfile_finout.dta
keep if _merge==1
drop _merge
****178,617 after fin and real estate
*drop subsidiaries of foreign investors (with no investment activity), both parent tests must be satisfied 
drop if (mors_orgnr<800000000 & test1>=90 & test2==1)
*176,210 firms 
rename orgnr operorgnr
rename mors_orgnr orgnr
rename founded operfounded
***check when the parent companies were founded if these are just holding companies or real established parents
merge m:1 orgnr using ./0_all_population_founding_years.dta
drop if _merge==2
drop _merge
**gen a variable defining the difference in the establishment years between parent and subsidiary
gen help=founded-operfounded
*drop Norwegian subsidiaries with no investment activity
drop if (test1>=90 & test2==1 & help<-1)
drop if (test1>=90 & test2==1 & help==. & orgnr!=.)
*156,862 non-subsidiary firms left
drop aar first
rename orgnr mors_orgnr 
rename founded mors_founded
save ./0_population2004-2018.dta, replace

***identify holding companies
use ./0_population2004-2018.dta, clear
*keep only where we identify a parent (could be an established firm or a holding)
drop if test1<90
drop if mors_orgnr==. | mors_orgnr<800000000
drop if operorgnr==mors_orgnr
drop if help<-1
drop if help==.
order mors_orgnr
sort mors_orgnr operorgnr
bys mors_orgnr: gen count=_N
*10,059 firms with holding companies and not the established parent
keep operorgnr mors_orgnr mors_founded
rename mors_founded founded
save ./1a_holdings.dta, replace

**identify operating firms and holding firms (investments are going into a holding firm, but everything else is coming from the operating firm)
use ./0_population2004-2018.dta, clear
keep operorgnr operfounded
merge 1:1 operorgnr using ./1a_holdings.dta
gen operating=1 if _merge==3
keep operorgnr operfounded operating
rename operorgnr mors_orgnr 
merge 1:m mors_orgnr using ./1a_holdings.dta
gen holding=1 if _merge!=1
keep mors_orgnr operfounded founded operating holding
rename mors_orgnr orgnr 
replace operfounded=founded if founded!=.
drop founded
rename operfounded founded
duplicates drop
drop if founded<2004
***160,659 firms
***add back some VC-backed firms and other startups that were kicked out through the filter, but were manually checked
merge 1:1 orgnr using "\\won.valuta.nhh.no\home\s14958\Documents\returns paper\KMR7\1k population clean.dta", keepusing(orgnr founded)
**41 firms added back
drop _merge
save ./1a_population.dta, replace

**Potential innovative industry in the first reported year
use ./1a_population.dta, clear
merge 1:m orgnr using "S:\Katja_Aksel\raw files\Bransjeinfo_1993-2018.dta", keepusing(orgnr aar bransjek_07 bransjek_02)
drop if _merge==2
drop _merge
sort orgnr aar
gsort orgnr -aar
bys orgnr: replace bransjek_02=bransjek_02[_n-1] if bransjek_02==. & bransjek_02[_n-1]!=.
bys orgnr: replace bransjek_07=bransjek_07[_n-1] if bransjek_07==. & bransjek_07[_n-1]!=.
sort orgnr aar
bys orgnr: gen first=1 if _n==1
keep if first==1
drop first aar
***exclude potentially non-innovative industries, which were manually checked. either 07 or 02 code
merge m:1 bransjek_07 using ./non-innovative_bransjek_07_exclude.dta
drop if _merge==2
gen innov_07=0 if _merge==3 | bransjek_07==.
replace innov_07=1 if _merge==1 
drop _merge
merge m:1 bransjek_02 using ./non-innovative_bransjek_02_exclude.dta
drop if _merge==2
gen innov_02=0 if _merge==3 | bransjek_02==.
replace innov_02=1 if _merge==1 
drop _merge
drop bransjek*
gen innovindustry=innov_02+innov_07
replace innovindustry=1 if innovindustry>1
drop innov_*
save ./1c_population_industry.dta, replace

***location of board members in the first reported year
use "S:\Katja_Aksel\raw files\1998-2017board data.dta", clear
merge m:1 orgnr using ./1c_population_industry.dta
*15,745 firms are not in the board data
keep if _merge==3
drop _merge
bys orgnr: egen help=min(yr)
keep if yr==help
keep orgnr com_postnr ind_postnr
duplicates drop
gen diff=abs(ind_postnr-com_postnr)
drop if diff==. | diff<5
bys orgnr: egen help=max(diff)
*The numbers start at 00 and increase with the distance from the capital city Oslo. Look at difference bigger than 1500*
drop if help<1500
keep orgnr
duplicates drop
merge 1:1 orgnr using ./1c_population_industry.dta
gen firstyrfarboard=(_merge==3)
drop _merge
save ./1d_population_farboard.dta, replace


**Located in an innovation hub in the first reported year
use "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", clear
keep orgnr aar poststed
merge m:1 orgnr using ./1d_population_farboard.dta
drop if _merge==1
drop _merge
gsort orgnr -aar
*if the first observation is not filled in, roll down
bys orgnr: replace poststed=poststed[_n-1] if poststed=="" & poststed[_n-1]!=""
*4 changes
sort orgnr aar
bys orgnr: gen first=1 if _n==1
keep if first==1
drop first aar
gen innhub=(poststed=="OSLO" | poststed=="BERGEN" | poststed=="TRONDHEIM" | poststed=="STAVANGER")
drop poststed
save ./1e_population_hub.dta, replace

******EXIT EVENTS
use "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", clear
keep orgnr aar bors_aks konkaar
merge m:1 orgnr using ./1e_population_hub.dta
drop if _merge==1
drop _merge
replace konkaar=0 if konkaar==.
replace bors_aks=0 if bors_aks==.
***identify officially filed bankruptcies and ipos from the registry file
gen ipo=aar if bors==1
*the year of ipo is the earliest observable year on the stock exchange
bys orgnr: egen ipoyear=min(ipo)
*the year of bankruptcy is the latest filing of bankruptcy
bys orgnr: egen bankruptyear=max(konkaar)
drop ipo bors aar konkaar
duplicates drop
**double-check and add information from the bankrutpcy file
merge 1:m orgnr using "S:\Katja_Aksel\raw files\konkurser_1993-18.dta", keepusing(konkdato)
drop if _merge==2
drop _merge
replace konkdat=0 if konkdato==.
*take only the latest observable date
bys orgnr: egen help=max(konkdato)
drop konkdato
duplicates drop
replace bankruptyear=year(help) if bankruptyear==0 & help!=0
drop help
replace bankruptyear=. if bankruptyear==0 | bankruptyear>2018
***identify mergers from the merger file
merge 1:m orgnr using "S:\Katja_Aksel\raw files\NHH_fusjoner_rydd_sum_1997_2020.dta", keepusing(tim_fuaar)
drop if _merge==2
drop _merge
replace tim_fuaar=. if tim_fuaar>2018 | tim_fuaar<=founded
bys orgnr: egen mergeryear=min(tim_fuaar)
drop tim_fuaar
duplicates drop
save ./1f_population_bankmergipo.dta, replace


**Identify liquidations without bankruptcy
************************************************
***prepare the realization file, looking for transaction type liquidation here
use "S:\Katja_Aksel\raw files\1088_realization 2004-2018 all.dta", clear
rename aksje_orgnr orgnr
merge m:1 orgnr using ./1a_population.dta, keepusing (orgnr founded)
keep if _merge==3
*99,213 firms not in the realization file
drop _merge
duplicates drop
	gen	 		syear=substr(realisert_dato,-4,4)
	destring	syear, replace
	gen 		smonth=substr(realisert_dato,-7,2)
	destring	smonth, replace
	gen 		sday=substr(realisert_dato,-10,2)
	destring	sday, replace
	gen			sdate=mdy(smonth,sday,syear)
	format		sdate %td

	gen	 		pyear=substr(ervervet_dato,-4,4)
	destring	pyear, replace
	gen 		pmonth=substr(ervervet_dato,-7,2)
	destring	pmonth, replace
	gen 		pday=substr(ervervet_dato,-10,2)
	destring	pday, replace
	gen			pdate=mdy(pmonth,pday,pyear)
	format		pdate %td
	drop realisert_dato ervervet_dato *month *day

	gen investor_id=akt_lopenr
	replace investor_id=substr(akt_lopenr,-9,9) if akt_sy_type=="U"
	destring investor_id, replace
	format %14.0g investor_id
	drop akt_lopenr
	order orgnr sdate pdate aar investor_id 
	sort orgnr investor_id sdate pdate aar aksje_aksjeklasse realisasjon_id

*check if the investors are holding companies and replace by ultimate investor
merge m:1 investor_id using "\\won.valuta.nhh.no\home\s14958\Documents\returns paper\holding companies\holdings.dta"
drop if _merge==2
replace investor_id=actual_investor_id if _merge==3
replace aksjonaer_navn=actual_investor_name if _merge==3
replace akt_sy_type="P" if _merge==3
drop _merge actual*
save ./1_realization_file.dta, replace

use ./1_realization_file.dta, clear
keep orgnr realisasjon_type syear
**keep only liquidation transactions
keep if realisasjon_type=="L" | realisasjon_type=="P" | realisasjon_type=="PLS"
duplicates drop
*17,814 firm-years
*take the earliest year
bys orgnr: egen liquidation=min(syear)
keep orgnr liquidation
duplicates drop
merge 1:1 orgnr using ./1f_population_bankmergipo.dta
drop _merge
*if we already observe bankruptcy or merger, we do not need to identify shares liquidation anymore, they are a part of bankruptcy or merger
replace liquidation=. if liquidation==bankruptyear & liquidation!=. & bankruptyear!=.
replace liquidation=. if liquidation==mergeryear & liquidation!=. & mergeryear!=.
replace liquidation=. if liquidation==ipoyear & liquidation!=. & ipoyear!=.
save ./1g_population_liquid.dta, replace


****now identify acquisitions from the registry file by looking at the relationship to parent companies
use ./1g_population_liquid.dta, clear
merge 1:m orgnr using "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", keepusing (aar mors*)
drop if _merge==2
drop _merge
order orgnr aar
sort orgnr aar
bysort orgnr: gen first=_n
sort orgnr aar
*check if the firm receives a parent after the first observation (and did not have a parent before)
bysort orgnr: gen acq=aar if (mors_eandel[_n-1]==. | mors_eandel[_n-1]<90) & mors_eandel>=90 & mors_eandel!=. & first!=1
*take the earliest year
bysort orgnr: egen acquisition=min(acq)
*drop if no acquisitions observed
drop if acquisition==.
keep if aar==acquisition
*drop if the firm owns itself all shares
drop if orgnr==mors_orgnr
*16 observations dropped
*drop if the parent is not a firm, but an individual, then it is not an acquisition
drop if mors_orgnr>999999999
*8,079 dropped, 8,302 acquisitions
keep orgnr mors_orgnr acquisition
save ./helpfile_acquisitions.dta, replace

use ./helpfile_acquisitions.dta, clear
*check when the parent firm is founded (maybe a holding company)
keep mors_orgnr
duplicates drop
rename mors_orgnr orgnr
drop if orgnr==0
*1 observation dropped
merge 1:m orgnr using "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", keepusing (stiftaar)
drop if _merge==2
drop _merge
bys orgnr: egen morsfounded=min(stiftaar)
keep orgnr morsfounded
duplicates drop
rename orgnr mors_orgnr
*now merge back
merge 1:m mors_orgnr using ./helpfile_acquisitions.dta
drop _merge	
*drop if the acquisition happens in the same year or one year after the parent company is established, then it is a holding company
drop if acquisition==morsfounded | acquisition==morsfounded+1
*1,210 observations dropped
keep orgnr acquisition
drop if acquisition>2018
merge 1:1 orgnr using ./1g_population_liquid.dta
drop _merge 
*if it is a merger year, then declare as a merger
replace acquisition=. if mergeryear==acquisition & acquisition!=.
*1 change
*if acquisition and liquidation are in the same year, declare as acquisition
replace liquidation=. if liquidation==acquisition & liquidation!=. & acquisition!=.
*59 changes
save ./1h_population_exit.dta, replace

**identify zombies and disappearances from the accounting file - financial reporting every year. all firms must submit their financial accounts every year
use "S:\Katja_Aksel\raw files\1992-2018accounting.dta", clear
keep orgnr aar salgsinn
*replace negative and missing values with zeros
replace salgsinn=0 if salgsinn==. | salgsinn<0
merge m:1 orgnr using ./1h_population_exit.dta
*everything merged in
keep if _merge==3
drop _merge
sort orgnr aar
bys orgnr: gen last=aar if _n==_N
bys orgnr: egen disappearyear=min(last)
drop last
replace disappearyear=. if disappearyear==2018
replace disappearyear=disappearyear+1
**identify as a zombie if still there (disappearyear==.) but 5 years no revenues
gen help=1 if disappearyear==. & aar==2018 & salgsinn==0
replace help=1 if disappearyear==. & aar==2017 & salgsinn==0
replace help=1 if disappearyear==. & aar==2016 & salgsinn==0
replace help=1 if disappearyear==. & aar==2015 & salgsinn==0
replace help=1 if disappearyear==. & aar==2014 & salgsinn==0
replace help=0 if help==.
bys orgnr: egen sum=sum(help)
gen zombie=(sum==5)
drop help sum aar salgsinn
duplicates drop
*replace as 0 or . if other exit events are there due to which firms become zombie or disappear
replace zombie=0 if zombie==1 & (ipoyear!=. | bankruptyear!=. | mergeryear!=.)
*35 changes, the most of them are bankruptcies, 5 ipo and 2 merger
replace disappear=. if disappear!=. & (ipoyear!=. | bankruptyear!=. | mergeryear!=.)
*23,408 changes
save ./1i_population_zomb.dta, replace


**English name in the first year
use "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", clear
keep orgnr navn aar
merge m:1 orgnr using ./1i_population_zomb.dta
*all firms merged in
drop if _merge==1
drop _merge
sort orgnr aar
bys orgnr: gen first=1 if _n==1
gsort orgnr -aar
bys orgnr: replace navn=navn[_n-1] if navn=="" & navn[_n-1]!=""
*1 change
sort orgnr aar
keep if first==1
recast str navn
**google translate, enin AI detection + manual check on top
merge 1:1 orgnr navn using ./language_detection_20210423.dta, keepusing(english_v1)
*all merged in
drop if _merge==2
drop _merge
gen english=(english_v1=="yes")
drop english_v1 navn first aar
save ./1j_population_english.dta, replace

**handle the holdings - we will consider the equity transactions of holding companies but the outcomes of the real operating firms
use ./1a_holdings.dta, clear
rename operorgnr orgnr
drop founded
merge m:1 orgnr using ./1j_population_english.dta
**keep only actually operating companies 
keep if _merge==3
drop _merge holding operating
*for each holding company generate the operating company outcome
sort mors_orgnr
bys mors_orgnr: egen openglish=max(english)
bys mors_orgnr: egen opacquisition=min(acquisition)
bys mors_orgnr: egen opliquidation=min(liquidation)
bys mors_orgnr: egen opfounded=min(founded)
bys mors_orgnr: egen opinnovindustry=max(innovindustry)
bys mors_orgnr: egen opfirstyrfarboard=max(firstyrfarboard)
bys mors_orgnr: egen opinnhub=max(innhub)
bys mors_orgnr: egen opipoyear=min(ipoyear)
bys mors_orgnr: egen opbankruptyear=min(bankruptyear)
bys mors_orgnr: egen opmergeryear=min(mergeryear)
bys mors_orgnr: egen opdisappearyear=min(disappearyear)
bys mors_orgnr: egen opzombie=max(zombie)
keep mors_orgnr op* 
duplicates drop
*8,166 holdings left
rename mors_orgnr orgnr
*merge back
merge 1:1 orgnr using ./1j_population_english.dta
*drop holdings that are not in our sample
drop if _merge==1
*56 observations dropped
*drop operating firms that are now in the holding observations
drop if operating==1
*150,641 firms after handling holdings
drop holding operating 

replace english=openglish if openglish>english & _merge==3
replace founded=opfounded if opfounded<founded & _merge==3
replace acquisition=opacquisition if opacquisition<acquisition & _merge==3
replace liquidation=opliquidation if opliquidation<liquidation & _merge==3
replace ipoyear=opipoyear if opipoyear<ipoyear & _merge==3
replace bankruptyear=opbankruptyear if opbankruptyear<bankruptyear & _merge==3
replace mergeryear=opmergeryear if opmergeryear<mergeryear & _merge==3
replace innovindustry=opinnovindustry if opinnovindustry>innovindustry & _merge==3
replace firstyrfarboard=opfirstyrfarboard if opfirstyrfarboard>firstyrfarboard & _merge==3
replace innhub=opinnhub if opinnhub>innhub & _merge==3
replace zombie=opzombie if opzombie>zombie & _merge==3
replace disappearyear=opdisappearyear if opdisappearyear<disappearyear & _merge==3
drop _merge op*
*150,641 firms
*double-check that we do not have any pe funds as holding company now
rename orgnr investor_id
merge 1:1 investor_id using ./0_all_pe_funds.dta, keepusing (investor_id)
*9 firms have slipped in, drop them
keep if _merge==1
drop _merge
rename investor_id orgnr
*150,632 firms
save ./1j2_population.dta, replace

****identify additional IPOs not identified before
use ./1j2_population.dta, clear
keep orgnr
merge 1:m orgnr using "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", keepusing(aar selskf)
keep if _merge==3
drop _merge
sort orgnr aar
bys orgnr: gen first=1 if _n==1
gen change=1 if selskf=="ASA" & first==1
bys orgnr: replace change=1 if change==. & selskf=="ASA" & selskf[_n-1]!="ASA"
keep if change==1
rename aar ipoyear2
keep orgnr ipoyear2
merge 1:1 orgnr using ./1j2_population.dta
*104 firms
drop _merge
sort orgnr
order orgnr ipoyear ipoyear2
replace ipoyear=ipoyear2 if ipoyear==. & ipoyear2!=.
*55 changes
replace ipoyear=ipoyear2 if ipoyear!=. & ipoyear>ipoyear2
*15 changes
drop ipoyear2


gen sample=((english==1 & innhub==1) | (english==1 & innov==1) | (english==1 & firstyr==1) | (innhub==1 & innov==1) | (innhub==1 & firstyr==1) | (innov==1 & firstyr==1))
tab sample
*41.35% are sample firms now

sort orgnr
*first event counts (only one exit event per firm), main exit events count (zombie, disappear, liquidation are residuals)
replace zombie=0 if zombie==1 & (disappearyear!=. | liquidation!=. | acquisition!=. | ipoyear!=. | bankruptyear!=.  | mergeryear!=.)
*1,070 changes
replace disappearyear=. if disappearyear!=. & (liquidation!=. | acquisition!=. | ipoyear!=. | bankruptyear!=. | mergeryear!=.)
*15,175 changes
replace liquidation=. if liquidation!=. & (acquisition!=. | ipoyear!=. | bankruptyear!=. | mergeryear!=.)
*1,039 changes
replace bankruptyear=. if bankruptyear!=. & (acquisition!=. | ipoyear!=. | mergeryear!=.)
*538 changes
replace mergeryear=. if mergeryear!=. & ((mergeryear>acquisition & mergeryear!=. & acquisition!=.) | (mergeryear>=ipoyear & mergeryear!=. & ipoyear!=.))
*1,480 changes
replace acquisition=. if acquisition!=. & ((acquisition>=mergeryear & acquisition!=. & mergeryear!=.) | (acquisition>=ipoyear & acquisition!=. & ipoyear!=.))
*59 changes
replace ipoyear=. if ipoyear!=. & ((ipoyear>mergeryear & ipoyear!=. & mergeryear!=.) | (ipoyear>acquisition & acquisition!=. & ipoyear!=.))
*2 change

***generate the year of exit

gen exityear=2018 if disappearyear==. & bankruptyear==. & liquidation==. & mergeryear==. & acquisition==. & ipoyear==.
replace exityear=disappearyear if disappearyear!=. & bankruptyear==. & liquidation==. & mergeryear==. & acquisition==. & ipoyear==.
replace exityear=bankruptyear if bankruptyear!=. & liquidation==. & mergeryear==. & acquisition==. & ipoyear==.
replace exityear=liquidation if bankruptyear==. & liquidation!=. & mergeryear==. & acquisition==. & ipoyear==.
replace exityear=mergeryear if bankruptyear==. & liquidation==. & mergeryear!=. & acquisition==. & ipoyear==.
replace exityear=acquisition if bankruptyear==. & liquidation==. & mergeryear==. & acquisition!=. & ipoyear==.
replace exityear=ipoyear if bankruptyear==. & liquidation==. & mergeryear==. & acquisition==. & ipoyear!=.
tab exityear

*generate an exit event variable: 1 still alive 2 zombie 3 bankruptcy 4 liquidation 5 disappearance 6 merger 7 acquisition 8 ipo
gen final=1 if disappearyear==. & bankruptyear==. & liquidation==. & mergeryear==. & acquisition==. & ipoyear==. & zombie==0
replace final=2 if disappearyear==. & bankruptyear==. & liquidation==. & mergeryear==. & acquisition==. & ipoyear==. & zombie==1
replace final=3 if (bankruptyear!=.)
replace final=4 if (liquidation!=.)
replace final=5 if disappearyear!=.
replace final=6 if (mergeryear!=.)
replace final=7 if (acquisition!=.)
replace final=8 if (ipoyear!=.)
*drop if ipo year is the year of the founding or after
drop if founded==ipoyear
*37 observations dropped
drop if founded+1==ipoyear
*23 observations
gen help=exityear-founded
drop if help<0
*1 observation dropped
drop help
*150,571 firms, 28 firms were not newly established
merge 1:m orgnr using "S:\Katja_Aksel\raw files\Bransjeinfo_1993-2018.dta", keepusing(sector)
drop if _merge==2
drop _merge
bys orgnr: egen industry=mode(sector), minmode
drop sector 
duplicates drop
replace industry=10 if industry==.
save ./1k_population.dta, replace

*************prepare the purchase file and check whether all firms are there
****purchase file
use "S:\Katja_Aksel\raw files\1088_purchase 2004-2018 all.dta", clear
rename aksje_orgnr orgnr
merge m:1 orgnr using ./1k_population.dta
*1,415 firms not in the file
keep if _merge==3
drop _merge
	gen investor_id=akt_lopenr
	replace investor_id=substr(akt_lopenr,-9,9) if akt_sy_type=="U"
	destring investor_id, replace
	format %14.0g investor_id
	drop akt_lopenr

gen	 		pyear=substr(erverv_dato,-4,4)
destring	pyear, replace
gen 		pmonth=substr(erverv_dato,-7,2)
destring	pmonth, replace
gen 		pday=substr(erverv_dato,-10,2)
destring	pday, replace
gen			pdate=mdy(pmonth,pday,pyear)
format		pdate %td
drop erverv_dato pday pmonth
order orgnr pdate aar investor_id aksje_id akt_sy_type
sort orgnr pdate investor_id aar aksje_aksjeklasse aksje_id erverv_id

*check if these are holding companies and replace by the actual investor
merge m:1 investor_id using "\\won.valuta.nhh.no\home\s14958\Documents\returns paper\holding companies\holdings.dta"
drop if _merge==2
replace investor_id=actual_investor_id if _merge==3
replace aksjonaer_navn=actual_investor_name if _merge==3
replace akt_sy_type="P" if _merge==3
drop _merge actual*
sort orgnr pdate investor_id aar aksje_aksjeklasse aksje_id erverv_id
save ./1_purchase_file.dta, replace

**preparing the file
use ./1_purchase_file.dta, clear
*6million obs
**drop if the firm owns its own shares
drop if investor_id==orgnr
*22,440 observations dropped
**drop if the number of shares <=0
drop if erverv_antall<=0
*53,511 observations dropped
*delete non-necessary, tax authority related variables
drop aksje_aksjeklasse_besk aksje_selskap_navn aksjonaer_navn erverv_akk_risk erverv_skjerming_i_aar erverv_ubenyttet_skjerm erverv_aarets_skj_gr_lag erverv_aarets_skj_fradrag erverv_skj_fradrag_t_framf erverv_kapital_korr erverv_samlet_skj_fradrag kapital_korr_inar erverv_fotnote
duplicates drop
*99 observations deleted
rename aksje_aksjeklasse aksjeklasse
keep orgnr pdate aar investor_id aksje_id akt_sy_type aksjeklasse erverv*
*klasse 3 is ordinary, everything else some sort of preferred
replace aksjeklasse=1 if aksjeklasse==3
replace aksjeklasse=2 if aksjeklasse!=1
tab aksjeklasse
*97.74% are ordinary shares
sort orgnr aar aksjeklasse investor_id pdate erverv_type erverv_id
	*each other following erverv_id are not double-entries, but separate parts of one transaction
	bysort orgnr aar aksjeklasse investor_id pdate erverv_type: gen test=erverv_id-erverv_id[_n-1]
	sum test, d
/*
976,474 observations have several entries on the same date with the same investor of the same purchase type and of the same share class reported in the same year, however the cut is pretty clear it is either a couple of seconds or another day. We assume if two transactions are typed in consequently after each other with the difference of less than 10000 seconds (less than 3 hours), then it is intentionally two entries. Otherwise these are correction entries.
 
                           test
-------------------------------------------------------------
      Percentiles      Smallest
 1%            1              1
 5%            1              1
10%            1              1       Obs             976,474
25%            1              1       Sum of wgt.     976,474

50%           12                      Mean            5206884
                        Largest       Std. dev.       6985974
75%     1.17e+07       2.08e+08
90%     1.60e+07       2.08e+08       Variance       4.88e+13
95%     1.80e+07       2.08e+08       Skewness       1.227973
99%     2.07e+07       2.16e+08       Kurtosis       11.08384

*/
	replace erverv_id=erverv_id[_n-1] if test<10000
	*summarize them
	bys orgnr aar aksjeklasse investor_id pdate erverv_type erverv_id: egen shares3112=sum(erverv_antall_3112)
	bys orgnr aar aksjeklasse investor_id pdate erverv_type erverv_id: egen shares=sum(erverv_antall)
	bys orgnr aar aksjeklasse investor_id pdate erverv_type erverv_id: egen amount=sum(erverv_beloep)
	bys orgnr aar aksjeklasse investor_id pdate erverv_type erverv_id: egen currentprice=mean(erverv_anskaff_verdi)
	drop erverv_antall_3112 erverv_beloep erverv_antall erverv_anskaff_verdi test
	duplicates drop
*567,818 observations dropped
	sort orgnr aar aksjeklasse investor_id pdate erverv_type erverv_id
	*exclude double-entries by taking the latest entry/tax return with the hightest erverv_id in a given year (shares are reported every year! So we check the consistency within each year first)
	bysort orgnr aar aksjeklasse investor_id pdate erverv_type: gen lauf=_n
	bysort orgnr aar aksjeklasse investor_id pdate erverv_type: egen lauf2=max(lauf)
	drop if lauf<lauf2
*408,656 observations dropped
	drop lauf*
	sort orgnr aksjeklasse investor_id erverv_type pdate aar 
	*take the latest tax return of each transaction (across the years)
	bys orgnr aksjeklasse investor_id erverv_type pdate: egen help=max(aar)
	keep if aar==help
*2,578,173 observations dropped
	drop help* aksje_id erverv_id
	sort orgnr investor_id pdate erverv_type aksjeklasse 
	gen pyear=year(pdate)
	drop if pyear<2003
*12,258 observations dropped
*608,071 transactions
save ./1a_purchase_file.dta, replace

***CHECK - not matched firms
use ./1a_purchase_file.dta, clear
merge m:1 investor_id using ./0_all_pe_funds.dta, keepusing(buyout)
*5,175 vc observations
drop if _merge==2
gen vc=(_merge==3)
bys orgnr: egen vcbacked=max(vc)
keep orgnr vcbacked
duplicates drop
*148,464 firms
*exclude manually checked old firms or firms with strange transactions
merge 1:1 orgnr using ./helpfile_excludeoldfirms.dta
keep if _merge==1
drop _merge
tab vcbacked
*124,348 firms
*928 firms
merge m:1 orgnr using ./1k_population.dta
keep if _merge==3
drop _merge
save ./1k_population_clean.dta, replace


use ./1a_purchase_file.dta, replace
merge m:1 orgnr using ./1k_population_clean.dta
keep if _merge==3
drop _merge
*1,158,928 observations
**keep only real purchases
keep if erverv_type=="K" | erverv_type=="N" | erverv_type=="T" | erverv_type=="NF"
*9,370 observations dropped
gen page=pyear-founded+1
**drop secondary trades before age of 0
drop if erverv_type=="K" & page<0
*2,154 dropped
tab page

*keep only purchases before the exit event
*drop all after the exit year
drop if pyear>exityear 
*545,548 dropped
* drop those in the year of acquisition, merger or IPO
drop if pyear==exityear & (final==6 | final==7 | final==8)
*35,468 dropped
save ./1b_purchase_file.dta, replace


****now adjust the realization file
use ./1_realization_file.dta, clear
merge m:1 orgnr using ./1k_population_clean.dta
keep if _merge==3
drop _merge
*1,291,237 observations
*drop if the firm owns its own shares
drop if investor_id==orgnr
*11,395 observations dropped
*drop if for some reason realization date is stated before the purchase date
	drop if sdate<=pdate
*166,963 observations dropped
*drop not relevant, tax authority-related variables
	drop aksjonaer_navn aksje_selskap_navn aksje_aksjeklasse_besk ///
	realisasjon_omkostninger realisasjon_div_korreksjon realisasjon_rest_skjerming ///
	realisasjon_fotnote realisasjon_benyttet_skjerming realisasjon_gevinst realisasjon_gevinst_uten_skjrm
duplicates drop
	rename aksje_aksjeklasse aksjeklasse
	replace aksjeklasse=1 if aksjeklasse==3
	replace aksjeklasse=2 if aksjeklasse!=1
tab aksjeklasse
*99.04% ordinary shares
	duplicates drop
	sort orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type realisasjon_id
	*each other following realisasjon_id are not double-entries, but separate parts of one transaction
	bysort orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type: gen test=realisasjon_id-realisasjon_id[_n-1]
	replace realisasjon_id=realisasjon_id[_n-1] if test<10000
	*summarize them
	bys orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type realisasjon_id: egen rshares=sum(realisert_antall)
	bys orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type realisasjon_id: egen rpuramount=sum(realisasjon_anskaffelses)
	bys orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type realisasjon_id: egen ramount=sum(realisasjon_samlet)
	drop realisert_antall realisasjon_anskaffelses_verdi realisasjon_samlet_vederlag test
	duplicates drop
*275,398 observations dropped
	sort orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type realisasjon_id
	*exclude double-entries by taking the latest entry with the highest realisasjon_id
	bysort orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type: gen lauf=_n
	bysort orgnr investor_id sdate pdate aksjeklasse erverv_type realisasjon_type: egen lauf2=max(lauf)
	drop if lauf<lauf2
*68,838 observations dropped
	drop lauf* realisasjon_id aksje_id
	duplicates drop
	order orgnr investor_id akt_sy_type pdate pyear erverv_type realisasjon_type aksjeklasse rshares rpuramount ramount
	sort orgnr investor_id pdate erverv_type realisasjon_type aksjeklasse 
**drop observations with an erroneous negative realization amount
	drop if ramount<0
*140 observations dropped
*replace the purchase amount with the right sign if inserted with a negative sign
	replace rpuramount=rpuramount*(-1) if rpuramount<0
*2,511 observations dropped

drop if year(pdate)<2003
*6 observations dropped
*keep only real purchases
keep if erverv_type=="K" | erverv_type=="N" | erverv_type=="T" | erverv_type=="NF"
*2,600 observations dropped
*keep only real realizations
keep if realisasjon_type=="R" | realisasjon_type=="L" | realisasjon_type=="P" | realisasjon_type=="PLS" 
*7,477 observations

gen page=pyear-founded+1
drop if erverv_type=="K" & page<0
*410 observations dropped

*keep only purchases before the exit event
drop if pyear>exityear 
*299,647 observations dropped
drop if pyear==exityear & (final==6 | final==7 | final==8)
*23,769 observations dropped

duplicates drop
compress
save ./1b_realization_file.dta, replace

***double-check if there are any purchases in the realization file not captured before, simply by looking at comparing purchase transactions
use ./1b_realization_file.dta, clear 
bys orgnr investor_id pdate erverv_type aksjeklasse: egen pshares=sum(rshares)
bys orgnr investor_id pdate erverv_type aksjeklasse: egen pamount=sum(rpuramount)
keep orgnr investor_id pdate erverv_type aksjeklasse akt_sy_type pshares pamount 
duplicates drop
*347,286 purchases
merge 1:1 orgnr investor_id pdate erverv_type aksjeklasse using ./1b_purchase_file.dta
*23 not merged in, 23 changes
keep orgnr investor_id pdate erverv_type aksjeklasse akt_sy_type pshares pamount shares amount
sort orgnr investor_id pdate erverv_type aksjeklasse 
replace shares=pshares if shares==. & pshares!=.
replace amount=pamount if amount==. & pamount!=.
drop pshares pamount
merge m:1 orgnr using ./1k_population_clean.dta, keepusing(orgnr)
*all matched
keep if _merge==3
drop _merge
***identify VC transactions
merge m:1 investor_id using ./0_all_pe_funds.dta, keepusing(buyout)
*4,212 vc-backed transactions
	drop if _merge==2
gen vcinvestor=(_merge==3)
	drop _merge
save ./2a_all_purchases.dta, replace 

***double-check if there are any purchases in the realization file not captured before, sales transactions are purchase transactions for someone else. add them if not there
use ./1b_realization_file.dta, clear 
keep if realisasjon_type=="R"
keep orgnr sdate aksjeklasse rshares ramount
bys orgnr sdate aksjeklasse: egen fshares=sum(rshares)
bys orgnr sdate aksjeklasse: egen famount=sum(ramount)
keep orgnr sdate fshares famount aksjeklasse
duplicates drop
rename sdate pdate
merge 1:m orgnr pdate aksjeklasse using ./2a_all_purchases.dta
*19,396 new purchases, 139,280 were in both files
sort orgnr pdate investor_id erverv_type aksjeklasse
replace shares=fshares if shares==. 
replace amount=famount if amount==. 
replace erverv_type="K" if erverv_type==""
drop fshares famount _merge

merge m:1 orgnr using ./1k_population_clean.dta
drop _merge
*double-check: keep only purchases before the exit event
drop if year(pdate)>exityear 
*7,505 observations dropped
drop if year(pdate)==exityear & (final==6 | final==7 | final==8)
*6,545 observations dropped
*other checks
drop if year(pdate)<2003
*0 dropped
gen page=year(pdate)-founded+1
**drop secondary trades before age of 0
drop if erverv_type=="K" & page<0
*0 dropped
save ./2b_all_purchases.dta, replace 

use ./2b_all_purchases.dta, clear
keep if erverv_type=="T" | erverv_type=="N" | erverv_type=="NF"
gen pyear=year(pdate)
collapse (sum) shares, by (orgnr pyear)
save ./helpfile_shares_per_year.dta, replace

use ./2b_all_purchases.dta, clear
keep if erverv_type=="T" | erverv_type=="N" | erverv_type=="NF"
rename shares dateshares
collapse (sum) dateshares, by (orgnr pdate)
save ./helpfile_shares_per_date.dta, replace


*****FIRM SHARES OUTSTANDING*****
********************************************************************************
use "S:\Katja_Aksel\raw files\1086_firm ownership_2004-2018 all.dta", clear
merge m:1 orgnr using ./1k_population_clean.dta
	drop if _merge==1
	drop _merge
*everything merged in
	sort orgnr aar oppgave_nr siste_oppgave
	duplicates drop
*0 duplicates, 1mill observations
	*siste-oppgave indicates the latest/most current entry
	keep if siste_oppgave=="J"
*212,120 dropped
	*aksje_id=-1 is the missing value
	drop if aksje_id==-1
*33,064 dropped
	*replace erroneous entries by 0
	replace aksje_tot_ant_0101=0 if aksje_tot_ant_0101==-1
*4,724 changes
	replace aksje_tot_ant_3112=0 if aksje_tot_ant_3112==-1
*1,035 changes
	replace aksje_tot_kapital_0101=0 if aksje_tot_kapital_0101==-1
*4,667 changes
	replace aksje_tot_kapital_3112=0 if aksje_tot_kapital_3112==-1
*1,034 changes
	replace tot_kapital_0101=0 if tot_kapital_0101==-1
*4,631 changes
	replace tot_kapital_3112=0 if tot_kapital_3112==-1
*998 changes
	*differentiate between ordindary and other shares, mismatch in classification otherwise later on
	replace aksje_aksjeklasse=1 if aksje_aksjeklasse==3
	replace aksje_aksjeklasse=2 if aksje_aksjeklasse!=1
tab aksje_aksjeklasse
*98.47% are ordinary shares
	order orgnr aar oppgave_id 
	sort orgnr aar aksje_aksjeklasse oppgave_id
***keep only relevant variables, drop tax authority-related variables
	keep aar orgnr aksje_id oppgave_id oppgave_status aksje_aksjeklasse aksje_tot_ant_0101 aksje_tot_ant_3112 
	duplicates drop	
*5 observations dropped
	rename aksje_aksjeklasse aksjeklasse
	sort orgnr aar aksjeklasse oppgave_id
	bys orgnr aar aksjeklasse: gen help=_n
	bys orgnr aar aksjeklasse: egen help2=max(help)
	bys orgnr aar aksjeklasse: egen help3=max(oppgave_id)
	drop if help2>1 & oppgave_id!=help3
*3,790 observations dropped
	drop help*
**double-check how many reporting id per firm/year/share class
bys orgnr aar aksjeklasse oppgave_id aksje_id: gen lauf=_n
bys orgnr aar aksjeklasse oppgave_id aksje_id: egen lauf2=max(lauf)
tab lauf2
*2,626 observations are multiple (0.35%)
bys orgnr aar aksjeklasse oppgave_id aksje_id: egen help=mean(aksje_tot_ant_0101)
bys orgnr aar aksjeklasse oppgave_id aksje_id: egen help2=mean(aksje_tot_ant_3112)
gen test1=(aksje_tot_ant_0101==help) if lauf2>1
gen test2=(aksje_tot_ant_3112==help2) if lauf2>1
count if (test1==0 | test2==0) & lauf2>1
*7 firms are affected
**Status KF is the prepation for the tax filing, status OK, is return is filed; so in these cases there is always one KF observation and one OK observation; we keep OK
drop if oppgave_status!="OK" & (test1==0 | test2==0) & lauf2>1
**8 observations dropped (1 firm with two different shares)
drop oppgave_status lauf* test* help*
duplicates drop
*1,305 duplicates dropped
*summarize all other entries in one observation with the same reporting id in case several observations
	bys orgnr aar aksjeklasse: gen help=_n
	bys orgnr aar aksjeklasse: egen help2=max(help)
	bys orgnr aar aksjeklasse: egen help3=max(oppgave_id)
	bys orgnr aar aksjeklasse oppgave_id: egen help4=sum(aksje_tot_ant_0101)
	bys orgnr aar aksjeklasse oppgave_id: egen help5=sum(aksje_tot_ant_3112)
	replace aksje_tot_ant_0101=help4 if help2!=1
*3,118 changes
	replace aksje_tot_ant_3112=help5 if help2!=1
*3,451 changes
	drop help* aksje_id oppgave_id
	duplicates drop
*2,464 observations dropped
bys orgnr aar: egen firmshares3112=sum(aksje_tot_ant_0101)
bys orgnr aar: egen firmshares0101=sum(aksje_tot_ant_0101)
keep orgnr aar firmshares*
duplicates drop
*745,035 observations
tab aar
/*
       AAR |      Freq.     Percent        Cum.
------------+-----------------------------------
       2004 |      6,329        0.85        0.85
       2005 |     14,060        1.89        2.74
       2006 |     20,478        2.75        5.49
       2007 |     26,753        3.59        9.08
       2008 |     30,960        4.16       13.23
       2009 |     34,145        4.58       17.81
       2010 |     37,553        5.04       22.86
       2011 |     42,012        5.64       28.49
       2012 |     51,993        6.98       35.47
       2013 |     60,991        8.19       43.66
       2014 |     69,317        9.30       52.96
       2015 |     78,530       10.54       63.50
       2016 |     87,517       11.75       75.25
       2017 |     95,011       12.75       88.00
       2018 |     89,386       12.00      100.00
------------+-----------------------------------
      Total |    745,035      100.00

*/
sort orgnr aar
**roll down from the beginning value of the next year (in case there were some adjustments in shares)
bys orgnr: replace firmshares3112=firmshares0101[_n+1] if firmshares3112!=firmshares0101[_n+1] & firmshares0101[_n+1]!=.
*145,045 changes
merge m:1 orgnr using ./1k_population_clean.dta, keepusing(orgnr)
drop _merge
*everything merged in
save ./3_shares_outstanding.dta, replace


use ./1k_population_clean.dta, clear
keep orgnr founded exityear final
expand 16
sort orgnr
bys orgnr: gen first=1 if _n==1
gen pyear=2003 if first==1
bys orgnr: replace pyear=pyear[_n-1]+1 if first==.
drop first
sort orgnr pyear 
rename pyear aar
save ./1z_full_matrix.dta, replace


*****CHECKS
use ./1z_full_matrix.dta, clear
***collect the number of shares from the firm registry file to double-check
merge 1:1 orgnr aar using "S:\Katja_Aksel\raw files\Foretaksinfo_1993-2018.dta", keepusing(antaksj)
drop if _merge==2
drop _merge
***and the number of shares from the tax returns
merge 1:1 orgnr aar using ./3_shares_outstanding.dta
drop _merge
sort orgnr aar
rename aar pyear
***and from the purchase file
merge 1:1 orgnr pyear using ./helpfile_shares_per_year.dta
drop _merge
*drop observations before the founding year
drop if firmshares3112==. & shares==. & pyear<=founded
sort orgnr pyear
**roll down from the beginning value of the next year (in case there were some adjustments in shares)
bys orgnr: replace firmshares3112=firmshares0101[_n+1] if firmshares3112!=firmshares0101[_n+1] & firmshares0101[_n+1]!=.
*2,075 changes
replace shares=0 if shares==.
bys orgnr: gen first=1 if _n==1
***replace end of year shares by newly issued shares from the purchase file and from the registry in case missing values
replace firmshares3112=shares if first==1 & firmshares3112==. & shares!=0
*305 changes
replace firmshares3112=antaksj if first==1 & firmshares3112==. & antaksj!=.
*0 changes
sort orgnr pyear
**roll over from the previous year + newly issued shares in case missing values
bys orgnr: replace firmshares3112=firmshares3112[_n-1]+shares if firmshares3112==. & firmshares3112[_n-1]!=. & first==.
*178,512 changes (but also after exit events, for the full matrix)
drop first
sort orgnr pyear
*replace the beginning of the year number of shares if missing value
bys orgnr: replace firmshares0101=firmshares3112[_n-1] if firmshares0101==. & firmshares3112[_n-1]!=.
keep orgnr pyear firmshares* 
*take care of bankruptcies
bys orgnr: replace firmshares3112=firmshares3112[_n-1] if (firmshares3112<=1 | firmshares3112==.) & firmshares3112[_n-1]>=1 & firmshares3112[_n-1]!=.
*928 changes
bys orgnr: replace firmshares0101=firmshares3112[_n-1] if (firmshares0101<=1 | firmshares0101==.) & firmshares3112[_n-1]!=. 
*757 changes
**if missing value in beginning of the year, assume 0
replace firmshares0101=0 if firmshares0101==.
*1,620 changes
sum firmshares3112, d
sum firmshares0101, d

**check if firms have never reported shares
bys orgnr: egen test=max(firmshares3112)
drop if test==0
drop test
*12,445 observations dropped
save ./4_shares_outstanding.dta, replace

****add this information back to the purchase file
use ./2b_all_purchases.dta, clear
keep orgnr pdate
duplicates drop
**add number of newly issued shares per date
merge m:1 orgnr pdate using ./helpfile_shares_per_date.dta 
drop _merge
sort orgnr pdate
*for the dates of secondary trade where no new shares are issued
replace dateshares=0 if dateshares==.
gen pyear=year(pdate)
bys orgnr pyear: gen first=1 if _n==1
*how many new shares are issued cumulatively per year
gen new=dateshares if first==1
bys orgnr pyear: replace new=new[_n-1]+dateshares if new==.
drop first
*merge in the firm data on shares outstanding
merge m:1 orgnr pyear using ./4_shares_outstanding.dta
keep if _merge==3
*2,080 purchase observations do not have the firm number of shares
drop _merge dateshares
*define number of shares outstanding at each purchase date (after that purchase)
gen outstanding=firmshares0101+new
keep orgnr pdate outstanding
merge 1:m orgnr pdate using ./2b_all_purchases.dta
keep if _merge==3
drop _merge
*generate the ownership variable
gen ownership=shares/outstanding
save ./2c_all_purchases.dta, replace 


